(7)Identify highly variable genes
createSeuratObjEachSample <- function(project.name,
dir,
min.cells = 3,
min.features = 500,
max.features = 6000,
assay = "RNA",
max.mt.percentage = 20,
num.hvf = 2000,
mt.pattern = "^mt-",
normalization.method = "LogNormalize",
hvf.selection.method = "vst"){
#### (1) Read files
cat(c("Read files\n"))
barcode.path <- file.path(dir, "barcodes.tsv")
features.path <- file.path(dir, "genes.tsv")
matrix.path <- file.path(dir, "matrix.mtx")
mat <- Matrix::readMM(file = matrix.path)
feature.names = read.delim(features.path,
header = FALSE,
stringsAsFactors = FALSE)
barcode.names = read.delim(barcode.path,
header = FALSE,
stringsAsFactors = FALSE)
colnames(mat) = paste(project.name, barcode.names$V1, sep = "_")
rownames(mat) = feature.names$V2
mat <- as.matrix(mat)
cat(c("-> matrix:", dim(mat), "\n"))
#### (2) 將基因名子一樣的資料加總起來
cat(c("Remove duplicated genes\n"))
IDfreqs <- table(rownames(mat))
IDfreqs <- IDfreqs[IDfreqs > 1]
cat(c("-> Num. dup. genes:", length(IDfreqs), "\n"))
if (length(IDfreqs) > 0){
for (i in names(IDfreqs)) {
index <- which(rownames(mat) == i)
duplicates <- as.matrix(mat[index, ])
sums <- apply(duplicates, 2, sum, na.rm=TRUE)
### 置換成加總數值
mat[index[1],] <- sums
### 將重複的資料移除
for (j in length(index):2) {
mat <- mat[-index[j], ]
}
}
}
cat(c("-> matrix:", dim(mat), "\n"))
#### (3) 資料載入seurat
cat(c("Create Seurat Object\n"))
seurat.obj <- Seurat::CreateSeuratObject(counts = mat,
project = project.name,
min.cells = min.cells,
min.features = min.features,
assay = assay,
meta.data = NULL)
cat(c("-> dimension:", dim(seurat.obj), "\n"))
#### (4) 計算mito gene的比例
cat(c("Calculate MT percentage\n"))
seurat.obj[["Mito_percent"]] <- Seurat::PercentageFeatureSet(seurat.obj, pattern = mt.pattern)
#### (5) Cell QC
cat(c("Cell QC\n"))
min_nFeature <- min.features
max_nFeature <- max.features
max_Mito_percent <- max.mt.percentage
seurat.obj <- seurat.obj %>%
subset(., subset = nFeature_RNA > min_nFeature & nFeature_RNA < max_nFeature) %>%
subset(., subset = Mito_percent < max_Mito_percent)
cat(c("-> dimension:", dim(seurat.obj), "\n"))
#### (6) Normalization
cat(c("Normalization\n"))
seurat.obj <- Seurat::NormalizeData(seurat.obj,
normalization.method = normalization.method,
scale.factor = 10000)
#### (7) Identify highly variable genes
cat(c("High variable genes\n"))
seurat.obj <- Seurat::FindVariableFeatures(seurat.obj,
selection.method = hvf.selection.method,
nfeatures = num.hvf
)
return(seurat.obj)
}
k_seurat <- createSeuratObjEachSample (project.nam = "k",
dir = file.path(data.dir, "K"),
min.cells = 3,
min.features = 500,
max.features = 6000,
assay = "RNA",
max.mt.percentage = 20,
num.hvf = 2000,
mt.pattern = "^mt-",
normalization.method = "LogNormalize",
hvf.selection.method = "vst")
## Read files
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.0 GiB
## -> matrix: 20304 6696
## Remove duplicated genes
## -> Num. dup. genes: 0
## -> matrix: 20304 6696
## Create Seurat Object
## Warning: Data is of class matrix. Coercing to dgCMatrix.
## -> dimension: 18738 6696
## Calculate MT percentage
## Cell QC
## -> dimension: 18738 6696
## Normalization
## Normalizing layer: counts
## High variable genes
## Finding variable features for layer counts
kl_seurat <- createSeuratObjEachSample (project.nam = "kl",
dir = file.path(data.dir, "KL"),
min.cells = 3,
min.features = 500,
max.features = 6000,
assay = "RNA",
max.mt.percentage = 20,
num.hvf = 2000,
mt.pattern = "^mt-",
normalization.method = "LogNormalize",
hvf.selection.method = "vst")
## Read files
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.1 GiB
## -> matrix: 20304 7564
## Remove duplicated genes
## -> Num. dup. genes: 0
## -> matrix: 20304 7564
## Create Seurat Object
## Warning: Data is of class matrix. Coercing to dgCMatrix.
## -> dimension: 19458 7564
## Calculate MT percentage
## Cell QC
## -> dimension: 19458 7564
## Normalization
## Normalizing layer: counts
## High variable genes
## Finding variable features for layer counts
# 觀察兩個Seurat object
VlnPlot(k_seurat, features = c("nFeature_RNA", "nCount_RNA", "Mito_percent"), ncol = 3)

VlnPlot(kl_seurat, features = c("nFeature_RNA", "nCount_RNA", "Mito_percent"), ncol = 3)

### 2. Data integration --------------------------------------------------------
# Create an ‘integrated’ data assay for downstream analysis
# Identify cell types that are present in both datasets
# Obtain cell type markers that are conserved in both control and stimulated cells
# Compare the datasets to find cell-type specific responses to stimulation
data.list <- list("k" = k_seurat, "kl" = kl_seurat)
# select features that are repeatedly variable across datasets for integration
features <- Seurat::SelectIntegrationFeatures(object.list = data.list,
nfeatures = 2000,
verbose = FALSE)
# identify anchors
anchors <- Seurat::FindIntegrationAnchors(object.list = data.list,
anchor.features = features,
verbose = F)
# this command creates an 'integrated' data assay
combined.obj <- Seurat::IntegrateData(anchorset = anchors)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
DefaultAssay(combined.obj) <- "integrated"
dim(combined.obj)
## [1] 2000 14260
combined.obj@meta.data$orig.ident %>% table()
## .
## k kl
## 6696 7564
combined.obj@meta.data %>% head()
## orig.ident nCount_RNA nFeature_RNA Mito_percent
## k_AAACCCAAGACAAGCC k 5388 1842 3.0252413
## k_AAACCCAAGATCGACG k 8021 2329 2.9173420
## k_AAACCCAAGGCTGTAG k 7292 2613 2.6330225
## k_AAACCCACAATAGAGT k 2706 912 0.4065041
## k_AAACCCACAGATTAAG k 4832 1579 3.7251656
## k_AAACCCAGTATGGTAA k 8292 2217 3.3043898
### 3. Run the standard workflow for visualization and clustering --------------
combined.obj <- Seurat::ScaleData(combined.obj, verbose = FALSE)
combined.obj <- Seurat::RunPCA(combined.obj, npcs = 50, verbose = FALSE)
ElbowPlot(combined.obj, ndims = 50)

pcs <- 20 # 肉眼判斷為20(符合paper作法)
# visualizing both cells and features that define the PCA
VizDimLoadings(combined.obj, dims = 1:2, reduction = "pca")

DimPlot(combined.obj, reduction = "pca") + NoLegend()

DimHeatmap(combined.obj, dims = 1, cells = 500, balanced = TRUE)

### 4. Run non-linear dimensional reduction (UMAP/tSNE)
combined.obj <- combined.obj %>%
Seurat::RunUMAP(reduction = "pca", umap.method = "uwot", dims = 1:pcs, verbose = FALSE) %>%
Seurat::RunTSNE(reduction="pca", dims= 1:pcs, verbose = FALSE) %>%
Seurat::FindNeighbors(reduction = "pca", dims = 1:pcs, verbose = FALSE) %>%
Seurat::FindClusters(resolution = 0.06, verbose = FALSE) %>% # paper提供resolution = 0.06
Seurat::FindSubCluster("0",resolution = 0.06, graph.name = "integrated_snn",
subcluster.name = "new cluster") # T cell進一步subcluster
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 4739
## Number of edges: 192814
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9576
## Number of communities: 2
## Elapsed time: 0 seconds
# 觀察cluster狀況
Seurat::DimPlot(combined.obj, reduction="umap")

Seurat::DimPlot(combined.obj, reduction="tsne")

Seurat::DimPlot(combined.obj, reduction="tsne", group.by = "orig.ident")

Seurat::DimPlot(combined.obj, reduction="tsne", group.by = "new cluster")

# 觀察Feature狀況
FeaturePlot(combined.obj, reduction="umap", features = "Mito_percent")

### 5. Find markers for each cluster -------------------------------------------
markers <- FindAllMarkers(combined.obj, only.pos = TRUE) %>%
group_by(cluster) %>%
dplyr::filter(avg_log2FC > 1) %>%
dplyr::arrange(desc(avg_log2FC)) %>%
slice_head(n = 10) %>%
ungroup() -> top10
## Calculating cluster 0
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 1
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 2
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 3
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 4
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.2, drop = FALSE]): NaNs produced
## Calculating cluster 5
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 6
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 7
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 8
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 9
# 將找到的marker進行視覺化,作heatmap
Seurat::DoHeatmap(combined.obj, group.by = "ident", features = top10$gene,draw.lines = TRUE) + NoLegend()

head(markers)
## # A tibble: 6 × 7
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 8.33e- 63 5.01 0.155 0.061 1.67e- 59 0 Il17f
## 2 4.75e-180 4.99 0.219 0.041 9.49e-177 0 Cd163l1
## 3 0 4.71 0.465 0.135 0 0 Dapl1
## 4 1.75e-127 4.68 0.134 0.036 3.49e-124 0 Trgv2
## 5 0 4.63 0.121 0.002 0 0 Trav4-2
## 6 3.39e- 62 4.46 0.186 0.091 6.78e- 59 0 Tcrg-C1
# 可以將特定gene作圖
FeaturePlot(combined.obj, reduction="umap", features = "Cd19", label = T)
## Warning: Could not find Cd19 in the default search locations, found in 'RNA'
## assay instead

VlnPlot(combined.obj, features = c("Cd19", "Cd3e"))
## Warning: Could not find Cd19 in the default search locations, found in 'RNA'
## assay instead
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

### 6.annotation(from paper) -----------------------------------------------------
b_cell_mark <- c("Cd19","Cd22","Cd79a","Cd79b")
cancer_mark <- c("Epcam","Krt18","Krt8","Krt19")
dendritic_mark <- c("Cd86","Cd83","Clec10a","Ccl22","Cxcl16","Xcr1","Rab7b","Naaa","Btla")
endo_mark <- c("Pecam1","Tek","Plvap","Thbd","Tmem100","Adgrf5","Podxl","Nostrin","Acvrl1")
macrophage_mark <- c("Adgre1","Itgam","Itgal","Cd14","Ccl9","F13a1","Cx3cr1","Cd68")
neutrophil_mark <- c("Csf3r","Cxcr2S100a8")
nk_mark <- c("Klrd1","Nkg7","Prf1","Gzma","Gzmb","Ccl5","Cma1","Klre1","Klra4")
plasma_dend_mark <- c("Ccr9","Bst2")
t_cell_mark <- c("Cd3d","Cd5","Cd3e")
# 觀察cell cluster marker是否能夠代表該cluster
FeaturePlot(combined.obj, reduction="umap", features = b_cell_mark[1:4], label = T)
## Warning: Could not find Cd19 in the default search locations, found in 'RNA'
## assay instead
## Warning: Could not find Cd22 in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, reduction="umap", features = cancer_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = dendritic_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = endo_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = macrophage_mark[1:4], label = T)
## Warning: Could not find Itgal in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, reduction="umap", features = neutrophil_mark, label = T)
## Warning: The following requested variables were not found: Cxcr2S100a8

FeaturePlot(combined.obj, reduction="umap", features = nk_mark[1:4], label = T)

FeaturePlot(combined.obj, reduction="umap", features = plasma_dend_mark, label = T)

# T cell
p1 <- FeaturePlot(combined.obj, reduction="umap", features = "Cd3e", label = T) # T cell
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead
p2 <- FeaturePlot(combined.obj, reduction="umap", features = "Tcf7", label = T) # activated T cell
## Warning: Could not find Tcf7 in the default search locations, found in 'RNA'
## assay instead
p3 <- FeaturePlot(combined.obj, reduction="umap", features = "Ctla4", label = T) # exhausted T cell
cowplot::plot_grid(p1, p2, p3, ncol = 3)

VlnPlot(combined.obj, features = c("Cd3e","Tcf7","Ctla4"))
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead
## Warning: Could not find Tcf7 in the default search locations, found in 'RNA'
## assay instead

# cluster annotation
combined.obj$`new cluster`[combined.obj$`new cluster` == "0_0"] <- "T cell activated"
combined.obj$`new cluster`[combined.obj$`new cluster` == "0_1"] <- "T cell exhausted"
combined.obj$`new cluster`[combined.obj$`new cluster` == "1"] <- "Endothelial cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "2"] <- "Macrophage"
combined.obj$`new cluster`[combined.obj$`new cluster` == "3"] <- "B cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "4"] <- "Neutrophils"
combined.obj$`new cluster`[combined.obj$`new cluster` == "5"] <- "dendritic cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "6"] <- "NK cell"
combined.obj$`new cluster`[combined.obj$`new cluster` == "7"] <- "cancer cell1"
combined.obj$`new cluster`[combined.obj$`new cluster` == "8"] <- "cancer cell2"
combined.obj$`new cluster`[combined.obj$`new cluster` == "9"] <- "Plasmacytoid dendritic cell"
combined.obj <- SetIdent(combined.obj, value = combined.obj@meta.data$`new cluster`)
Seurat::DimPlot(combined.obj, reduction="umap", label = T)

# visualization
features <- c("Cd3e","Ctla4")
# Ridge plots - from ggridges. Visualize single cell expression distributions in each cluster
RidgePlot(combined.obj, features = features, ncol = 2)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead
## Picking joint bandwidth of 0.0237
## Picking joint bandwidth of 0.022

# Violin plot - Visualize single cell expression distributions in each cluster
VlnPlot(combined.obj, features = features, ncol = 2)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

VlnPlot(combined.obj, features = features, split.by = "orig.ident")
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

# Feature plot - visualize feature expression in low-dimensional space
FeaturePlot(combined.obj, features = features)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, features = features, reduction = "umap",blend = TRUE)
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

FeaturePlot(combined.obj, features = features, split.by = "orig.ident")
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

# Dot plots - the size of the dot corresponds to the percentage of cells expressing the
# feature in each cluster. The color represents the average expression level
DotPlot(combined.obj, features = features) + RotatedAxis()
## Warning: Could not find Cd3e in the default search locations, found in 'RNA'
## assay instead

# Single cell heatmap of feature expression
DoHeatmap(subset(combined.obj, downsample = 100), features = top10$gene, size = 3) + NoLegend()

# Identify conserved cell type markers
sg <- list(bcell = c("Cd19","Cd22","Cd79a","Cd79b"),
cancer = c("Epcam","Krt18","Krt8","Krt19"),
dendritic = c("Cd86","Cd83","Clec10a","Ccl22","Cxcl16","Xcr1","Rab7b","Naaa","Btla"),
endo = c("Pecam1","Tek","Plvap","Thbd","Tmem100","Adgrf5","Podxl","Nostrin","Acvrl1"),
macrophage = c("Adgre1","Itgam","Itgal","Cd14","Ccl9","F13a1","Cx3cr1","Cd68"),
neutro= c("Csf3r","Cxcr2S100a8"),
nkcell = c("Klrd1","Nkg7","Prf1","Gzma","Gzmb","Ccl5","Cma1","Klre1","Klra4"),
plasma_dend = c("Ccr9","Bst2"),
tcell = c("Cd3d","Cd5","Cd3e"))
DotPlot(combined.obj, features = sg, cols = c("blue", "red"), dot.scale = 4, assay = "RNA") +
RotatedAxis()
## Warning: The following requested variables were not found: Cxcr2S100a8
## Warning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.
## ℹ Please use the `rows` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
